Murine convergent substitutions

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
library(kableExtra)
library(ggsignif)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
library(vroom)
library(ggdist)
#library(stringr)
source("../lib/design.r")
source("../lib/get_tree_info.r")

< Back to summary

tree_type = "astral"
save_tree_fig = F

cat(tree_type, " tree\n")
## astral  tree
cat("188 species targeted capture to mouse exons assembled with SPADES.\n")
## 188 species targeted capture to mouse exons assembled with SPADES.
cat("11,775 coding loci aligned with exonerate+mafft\n")
## 11,775 coding loci aligned with exonerate+mafft
cat("Gene trees inferred with IQtree.\n")
## Gene trees inferred with IQtree.
if(tree_type == "astral"){
  cat("Species tree inferred with ASTRAL.\n")
  cat("Branch lengths estimated using 865 loci with normalized RF distance to species tree <= 0.25.\n")
  tree_file = "../../data/trees/full_coding_iqtree_astral.cf.bl.nrf25.rooted.labeled.treefile"
  convergence_branch_file = "../../data/convergence/full-coding-astral-pairwise-convergence/substitutions-by-branch-pairs.csv"
  convergence_branch_nodesc_file = "../../data/convergence/full-coding-astral-pairwise-convergence-nodesc/substitutions-by-branch-pairs.csv"
  convergence_gene_file = "../../data/convergence/full-coding-astral-pairwise-convergence-nodesc/substitution-pairs-by-gene.csv"
  branch_count_file = "../../data/rates/full-coding-astral-mg94-local-branch-mns.csv"
  
  col_branches_file = "../../data/trees/astral-colonization-branches.txt"
  
  morpho_branches_file = "../../data/trees/astral-moprho-ou-shift-branches.txt"
  morpho_genes_file = "../../data/datasets/subset_morphofacial_genes.txt"
  
  arid_genes_file = "../../data/datasets/arid-genes-giorello-2018-pid.txt"
  arid_branches_file = ""
  
  busted_genes_file = "../../data/ps/busted-full-gene-list.txt"
  
  exclude_branches = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
  xmax = 0.125
}else if(tree_type == "concat"){
  cat("Species tree inferred by concatenation of all loci with IQtree.\n")
  tree_file = "../../data/trees/full_coding_iqtree_concat.cf.rooted.tree"
  anc_file = "../../data/rates/full-coding-concat-pairwise-convergence.csv"
  branch_count_file = ""
  exclude_branches = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
  xmax = 0.125
}
## Species tree inferred with ASTRAL.
## Branch lengths estimated using 865 loci with normalized RF distance to species tree <= 0.25.
cat("Ancestral sequences inferred with HyPhy\n")
## Ancestral sequences inferred with HyPhy
tree = read.tree(tree_file)
tree_to_df_list = treeToDF(tree)
tree_info = tree_to_df_list[["info"]]

if(tree_type == "astral"){
  tree_info = tree_info %>% separate(label, c("tp", "astral", "gcf", "scf"), sep="/", remove=F)
  tree_info$astral[tree_info$node.type=="tip"] = NA
  tree_info$astral = as.numeric(tree_info$astral)
  tree_info$gcf = as.numeric(tree_info$gcf)
  tree_info$scf = as.numeric(tree_info$scf)
}else if(tree_type == "concat"){
  tree_info = tree_info %>% separate(label, c("bootstrap", "gcf", "scf"), sep="/", remove=F)
  tree_info$bootstrap[tree_info$node.type=="tip"] = NA
  tree_info$bootstrap = as.numeric(tree_info$bootstrap)
  tree_info$gcf = as.numeric(tree_info$gcf)
  tree_info$scf = as.numeric(tree_info$scf)
}

1 Definitions

  1. Divergent substitution: A substitution at the same site on two branches of the tree to a different derived state regardless of ancestral state (e.g. A,A -> C,G or A,T -> C,G).
  2. Convergent substitution: A substitution at the same site on two branches of the tree that differ in their ancestral state to an identical derived state (e.g. A,G -> C,C).
  3. Parllel substitution: A substitution at the same site on two branches of the tree that share the same ancestral state to an identical derived state (e.g. A,A -> C,C).

For all pairwise analyses, comparisons between branches where one is the direct descendant of the other are excluded.

2 Species tree branch presence/absence per gene

Following the same procedure as before, we count convergent substitutions per species tree branch only in genes that contain that branch (full clade or partial clade).

3 Pairwise amino acid convergence

3.1 All pairs of branches except direct descendants

Convergent and divergent amino acid substitutions were counted on each pair of branches in the species tree in every gene where both branches were present, excluding pairs where one branch is the direct descendant of the other.

3.1.1 Divergent vs. convergent substitutions

We find a strong correlation between divergent and convergent amino acid substitutions – with a secondary correlation (orange points).

conv_branches = read.csv(convergence_branch_file, header=T, comment.char="#")
# Read the pairwise counts

cols_to_na = names(conv_branches)[3:80]
for(col in cols_to_na){
  conv_branches[conv_branches$node1 %in% exclude_branches,][[col]] = NA
  conv_branches[conv_branches$node2 %in% exclude_branches,][[col]] = NA
}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them

conv_branches = conv_branches %>% mutate(aa.div = rowSums(select(., ends_with(".div")), na.rm=T))
conv_branches = conv_branches %>% mutate(aa.par = rowSums(select(., ends_with(".par")), na.rm=T))
conv_branches = conv_branches %>% mutate(aa.con = rowSums(select(., ends_with(".con")), na.rm=T))

second_cor = subset(conv_branches, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor = rbind(second_cor, subset(conv_branches, aa.div < 1000 & aa.con > 100))
second_cor = rbind(second_cor, subset(conv_branches, aa.div < 400 & aa.con > 50))
second_cor = rbind(second_cor, subset(conv_branches, aa.div < 200 & aa.con > 30))

aa_p = ggplot(conv_branches, aes(x=aa.div, y=aa.con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Divergent AA substitutions") +
  ylab("Convergent AA substitutions") +
  bartheme()

print(aa_p)

3.1.2 Divergent vs. parallel substitutions

No secondary correlation between divergent and parallel (same ancestral state) substitutions, though those pairs seem to cluster on the lower end of parallel substitutions.

second_cor_par = subset(conv_branches, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor_par = rbind(second_cor_par, subset(conv_branches, aa.div < 1000 & aa.con > 100))
second_cor_par = rbind(second_cor_par, subset(conv_branches, aa.div < 400 & aa.con > 50))
second_cor_par = rbind(second_cor_par, subset(conv_branches, aa.div < 200 & aa.con > 30))

aa_p = ggplot(conv_branches, aes(x=aa.div, y=aa.par)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  geom_point(data=second_cor_par, aes(x=aa.div, y=aa.par), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  #geom_smooth(data=second_cor_par, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Divergent AA substitutions") +
  ylab("Parallel AA substitutions") +
  bartheme()

print(aa_p)

second_cor_br = select(second_cor, node1, node2)
second_cor_br_counts = data.frame(table(unlist(second_cor_br)))
names(second_cor_br_counts) = c("tp", "conv.pair.count")

#second_cor_p = ggplot(second_cor_br_counts, aes(x=tp, y=conv.pair.count)) + 
#  geom_bar(stat="identity") +
#  bartheme()
#print(second_cor_p)

uniq_info_cols = names(second_cor_br_counts)[!(names(second_cor_br_counts) %in% names(tree_info))] 
# # Get non common names
 
uniq_info_cols = c(uniq_info_cols, "tp") # appending key parameter
# Get a list of columns from the tree_info df to join to the tree rates df
 
tree_info = tree_info %>% left_join((second_cor_br_counts %>% select(uniq_info_cols)), by="tp")
# Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157

tree_info = tree_info[order(tree_info$node), ]
# Re-order the rows by the R node

tree_info$conv.pair.count[is.na(tree_info$conv.pair.count)] = 0

3.1.3 Branches involved in secondary correlation

Remember that each point above is a count between a PAIR of branches. This tree displays the number of times each branch in the tree is involved in one of the pairings in the secondary correlation (orange points).

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

conv_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=tree_info$conv.pair.count)) +
  scale_color_continuous(name='Times in secondary pair', low=l, high=h) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  geom_label(aes(x=branch, label=ifelse(tree_info$conv.pair.count>0,as.character(tree_info$conv.pair.count),'')), label.size=NA, fill="transparent") +
  theme(legend.position=c(0.15,0.9))
print(conv_tree)

if(save_tree_fig){
  gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
  tree_outfile = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
  ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}
# conv tree

3.1.4 Secondary correlation and branch length

No correlation between branch length and the number of times a branch is on this secondary correlation.

Note: ALL branches plotted, but still no correlation if only those with at least 1 pairing in the secondary correlation are used

secondary_tree_info = subset(tree_info, conv.pair.count > 0)

conv_bl_p = ggplot(tree_info, aes(x=branch.length, y=conv.pair.count)) +
  geom_point(size=2, alpha=0.3, color=corecol(numcol=1)) +
  xlab("Branch length") +
  ylab("# of times in second correlation") +
  bartheme()
print(conv_bl_p)

3.1.5 Secondary correlation and number of loci branch present in

No correlation between branch presence and the number of times a branch is on this secondary correlation.

Note: ALL branches plotted, but still no correlation if only those with at least 1 pairing in the secondary correlation are used

branch_presence = read.csv(branch_count_file, header=T)
names(branch_presence)[1] = "tp"

uniq_info_cols = names(branch_presence)[!(names(branch_presence) %in% names(tree_info))] 
# # Get non common names
 
uniq_info_cols = c(uniq_info_cols, "tp") # appending key parameter
# Get a list of columns from the tree_info df to join to the tree rates df
 
tree_info = tree_info %>% left_join((branch_presence %>% select(uniq_info_cols)), by="tp")
# Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157

tree_info = tree_info[order(tree_info$node), ]
# Re-order the rows by the R node


secondary_tree_info = subset(tree_info, conv.pair.count > 0)

conv_bp_p = ggplot(tree_info, aes(x=num.genes.full+num.genes.partial, y=conv.pair.count)) +
  geom_point(size=2, alpha=0.3, color=corecol(numcol=1)) +
  xlab("# of times branch in gene") +
  ylab("# of times in second correlation") +
  bartheme()
print(conv_bp_p)

3.1.6 Convergent substitutions and number of time branch pairs are present in gene trees

#secondary_tree_info = subset(tree_info, conv.pair.count > 0)

conv_pairs_p = ggplot(conv_branches, aes(x=count, y=aa.con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  geom_point(data=second_cor, aes(x=count, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
  #geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  #geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("# of times branches appear together") +
  ylab("Convergent AA substitutions") +
  bartheme()

print(conv_pairs_p)

3.1.7 Excluding convergent AA substitutions that are divergent at the codon level

This pattern, and actually most convergent amino acid substitutions, are actually driven by divergent substitutions at the codon level:

#conv_branches_nodesc = read.csv(convergence_branch_file, header=T, comment.char="#")
# Read the pairwise counts

#cols_to_na = names(conv_branches_nodesc)[3:80]
#for(col in cols_to_na){
#  anc_info_w_root[conv_branches_nodesc$node1 %in% exclude_branches,][[col]] = NA
#  anc_info_w_root[conv_branches_nodesc$node2 %in% exclude_branches,][[col]] = NA
#}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them

conv_branches = conv_branches %>% mutate(aa.div = rowSums(select(., ends_with(".div.div")), na.rm=T))
conv_branches = conv_branches %>% mutate(aa.par = rowSums(select(., ends_with(".par.par")), na.rm=T))
conv_branches = conv_branches %>% mutate(aa.con = rowSums(select(., ends_with(".con.con")), na.rm=T))

#second_cor = subset(anc_info_w_root, aa.con > 200)

aa_p = ggplot(conv_branches, aes(x=aa.div, y=aa.con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  #geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  #geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Divergent AA substitutions") +
  ylab("Convergent AA substitutions") +
  bartheme()

print(aa_p)

3.1.8 Including only convergent amino acid substitutions that are divergent at the codon level

#anc_info_w_root = read.csv(anc_file, header=T, comment.char="#")
# Read the pairwise counts

#cols_to_na = names(anc_info_w_root)[3:80]
#for(col in cols_to_na){
#  anc_info_w_root[anc_info_w_root$node1 %in% exclude_branches,][[col]] = NA
#  anc_info_w_root[anc_info_w_root$node2 %in% exclude_branches,][[col]] = NA
#}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them

#anc_info_w_root = anc_info_w_root %>% mutate(aa.div = rowSums(select(., ends_with(".div.div")), na.rm=T))
conv_branches = conv_branches %>% mutate(aa.con = rowSums(select(., ends_with(".div.con")), na.rm=T))

#second_cor = subset(anc_info_w_root, aa.con > 200)

aa_p = ggplot(conv_branches, aes(x=aa.div, y=aa.con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  #geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  #geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Divergent AA substitutions") +
  ylab("Convergent AA substitutions") +
  bartheme()

print(aa_p)

3.2 All pairs of branches except any descendants

Excluding descendant branches removes the secondary correlation

3.2.1 Divergent vs. convergent substitutions

conv_branches_nodesc = read.csv(convergence_branch_nodesc_file, header=T, comment.char="#")
# Read the pairwise counts

cols_to_na = names(conv_branches_nodesc)[3:80]
for(col in cols_to_na){
  conv_branches_nodesc[conv_branches_nodesc$node1 %in% exclude_branches,][[col]] = NA
  conv_branches_nodesc[conv_branches_nodesc$node2 %in% exclude_branches,][[col]] = NA
}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them

conv_branches_nodesc = conv_branches_nodesc %>% mutate(aa.div = rowSums(select(., ends_with(".div")), na.rm=T))
conv_branches_nodesc = conv_branches_nodesc %>% mutate(aa.par = rowSums(select(., ends_with(".par")), na.rm=T))
conv_branches_nodesc = conv_branches_nodesc %>% mutate(aa.con = rowSums(select(., ends_with(".con")), na.rm=T))

second_cor = subset(conv_branches_nodesc, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor = rbind(second_cor, subset(conv_branches_nodesc, aa.div < 1000 & aa.con > 100))
second_cor = rbind(second_cor, subset(conv_branches_nodesc, aa.div < 400 & aa.con > 50))
second_cor = rbind(second_cor, subset(conv_branches_nodesc, aa.div < 200 & aa.con > 30))

aa_p = ggplot(conv_branches_nodesc, aes(x=aa.div, y=aa.con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Divergent AA substitutions") +
  ylab("Convergent AA substitutions") +
  bartheme()

print(aa_p)

3.2.2 Divergent vs. parallel substitutions

second_cor_par = subset(conv_branches_nodesc, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor_par = rbind(second_cor_par, subset(conv_branches_nodesc, aa.div < 1000 & aa.con > 100))
second_cor_par = rbind(second_cor_par, subset(conv_branches_nodesc, aa.div < 400 & aa.con > 50))
second_cor_par = rbind(second_cor_par, subset(conv_branches_nodesc, aa.div < 200 & aa.con > 30))

aa_p = ggplot(conv_branches_nodesc, aes(x=aa.div, y=aa.par)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  geom_point(data=second_cor_par, aes(x=aa.div, y=aa.par), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  #geom_smooth(data=second_cor_par, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Divergent AA substitutions") +
  ylab("Parallel AA substitutions") +
  bartheme()

print(aa_p)

3.2.3 Rates of convergence by branch

conv_counts = data.frame("tp"=tree_info$tp)
conv_counts$aa.con.sum = 0
conv_counts$aa.div.sum = 0

for(i in 1:nrow(conv_branches_nodesc)){
  row = conv_branches_nodesc[i,]
  n1 = row$node1
  n2 = row$node2
  conv_counts[conv_counts$tp==n1,]$aa.con.sum = conv_counts[conv_counts$tp==n1,]$aa.con.sum + row$aa.con
  conv_counts[conv_counts$tp==n1,]$aa.div.sum = conv_counts[conv_counts$tp==n1,]$aa.div.sum + row$aa.div
  
  conv_counts[conv_counts$tp==n2,]$aa.con.sum = conv_counts[conv_counts$tp==n2,]$aa.con.sum + row$aa.con
  conv_counts[conv_counts$tp==n2,]$aa.div.sum = conv_counts[conv_counts$tp==n2,]$aa.div.sum + row$aa.div
}

tree_info = merge(tree_info, conv_counts, by="tp")

tree_info = tree_info[order(tree_info$node), ]
# Re-order the rows by the R node

tree_info$cd.ratio = tree_info$aa.con.sum / tree_info$aa.div.sum
tree_info[is.nan(tree_info$cd.ratio),]$cd.ratio = NA

conv_rates = ggplot(tree_info, aes(x=cd.ratio)) +
  geom_histogram(color="#ececec", fill="#333333") +
  scale_y_continuous(expand=c(0,0)) +
  xlab("Ratio of convergent to divergent substitutions") +
  ylab("# of branches") +
  bartheme()
print(conv_rates)

3.2.4 Rates of convergence on tree

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

conv_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=tree_info$cd.ratio)) +
  scale_color_continuous(name="Ratio of convergent to divergent substitutions", low=l, high=h) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  #geom_label(aes(x=branch, label=ifelse(tree_info$conv.pair.count>0,as.character(tree_info$conv.pair.count),'')), label.size=NA, fill="transparent") +
  theme(legend.position=c(0.15,0.9))
print(conv_tree)

if(save_tree_fig){
  gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
  tree_outfile = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
  ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}
# conv tree

4 Amino acid convergence by gene

Excluding all descendant pair comparisons

Do morpho branches experience more convergence in morpho genes compared to all genes?

Do morpho branches experience more convergence in morpho genes compared to other branches?

col_branches = read.csv(col_branches_file, header=F, comment.char="#")
names(col_branches) = c("tp")

tree_info$col.branch = "Other"

tree_info$col.branch[tree_info$tp %in% col_branches$tp] = "Colonization"

for(i in 1:nrow(tree_info)){
  if(tree_info[i,]$node.type == "internal" && tree_info[i,]$col.branch == "Colonization"){
    cur_desc = getDescendants(tree, tree_info[i,]$node)
    tree_info$col.branch[tree_info$node==cur_desc[1]] = "Descendant"
    tree_info$col.branch[tree_info$node==cur_desc[2]] = "Descendant"
  }
}

morpho_branches = read.csv(morpho_branches_file, header=F, comment.char="#")
names(morpho_branches) = c("tp")

tree_info$morpho.branch = "No OU shift"
tree_info$morpho.branch[tree_info$tp %in% morpho_branches$tp] = "OU shift"
morpho_genes = read.csv(morpho_genes_file, header=F)
names(morpho_genes)[1] = "pid"

arid_genes = read.csv(arid_genes_file, header=F)
names(arid_genes)[1] = "pid"

busted_genes = read.csv(busted_genes_file, header=F)
names(busted_genes)[1] = "pid"
#gene_conv = vroom(convergence_gene_file, comment="#", col_names=F)
gene_conv = read.table(convergence_gene_file, header=T, sep=",")
#names(gene_conv) = c("pid", "branch1", "branch2", "sub.type")
#gene_conv = gene_conv %>% separate(file, c("pid", "aln", "ext"), sep="-", remove=T)

gene_conv_table = data.frame("Data"=c(), "Slope"=c(), "R-squared"=c())

4.1 Convergent vs. divergent substitutions

non_mf_branches_all_genes = subset(gene_conv, !node1 %in% morpho_branches$tp & !node2 %in% morpho_branches$tp)
non_mf_branches_all_genes$label = "All genes, non-MF branches"

non_mf_branches_all_genes_by_genes = non_mf_branches_all_genes %>% group_by(pid, aa.class) %>% tally()
non_mf_branches_all_genes_by_genes = subset(non_mf_branches_all_genes_by_genes, aa.class=="div" | aa.class=="con")
non_mf_branches_all_genes_by_genes = spread(non_mf_branches_all_genes_by_genes, key = aa.class, value = n, fill=0)
non_mf_branches_all_genes_by_genes$label = "All genes, non-MF branches"

non_mf_branches_all_genes_by_genes_fit = lm(non_mf_branches_all_genes_by_genes$con ~ non_mf_branches_all_genes_by_genes$div)

non_mf_branch_all_gene_p = ggplot(non_mf_branches_all_genes_by_genes, aes(x=div, y=con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  #geom_point(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  #geom_smooth(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), method="lm", size=2, linetype="dashed", color="#920000") +
  ggtitle("All genes, non-MF branches") +
  xlab("Divergent AA substitutions") +
  ylab("Convergent AA substitutions") +
  bartheme() +
  theme(axis.title=element_text(size=12),
        plot.title = element_text(hjust=0.5, size=14))

#print(non_mf_branch_all_gene_p)


gene_conv_table = rbind(gene_conv_table, data.frame("Data"=c("All genes, non-MF branches"), "Slope"=c(non_mf_branches_all_genes_by_genes_fit$coefficients[2]), "R-squared"=c(summary(non_mf_branches_all_genes_by_genes_fit)$adj.r.squared)))
non_mf_branches_mf_genes_by_genes = subset(non_mf_branches_all_genes_by_genes, pid %in% morpho_genes$pid)
non_mf_branches_mf_genes_by_genes$label = "MF genes, non-MF branches"

non_mf_branches_mf_genes_by_genes_fit = lm(non_mf_branches_mf_genes_by_genes$con ~ non_mf_branches_mf_genes_by_genes$div)

non_mf_branch_mf_gene_p = ggplot(non_mf_branches_mf_genes_by_genes, aes(x=div, y=con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  #geom_point(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  #geom_smooth(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), method="lm", size=2, linetype="dashed", color="#920000") +
  ggtitle("MF genes, non-MF branches") +
  xlab("Divergent AA substitutions") +
  ylab("Convergent AA substitutions") +
  bartheme() +
  theme(axis.title=element_text(size=12),
        plot.title = element_text(hjust=0.5, size=14))

#print(non_mf_branch_mf_gene_p)

gene_conv_table = rbind(gene_conv_table, data.frame("Data"=c("MF genes, non-MF branches"), "Slope"=c(non_mf_branches_mf_genes_by_genes_fit$coefficients[2]), "R-squared"=c(summary(non_mf_branches_mf_genes_by_genes_fit)$adj.r.squared)))
mf_branches_all_genes = subset(gene_conv, node1 %in% morpho_branches$tp & node2 %in% morpho_branches$tp)
mf_branches_all_genes$label = "All genes, MF branches"

mf_branches_all_genes_by_genes = mf_branches_all_genes %>% group_by(pid, aa.class) %>% tally()
mf_branches_all_genes_by_genes = subset(mf_branches_all_genes_by_genes, aa.class=="div" | aa.class=="con")
mf_branches_all_genes_by_genes = spread(mf_branches_all_genes_by_genes, key = aa.class, value = n, fill=0)
mf_branches_all_genes_by_genes$label = "All genes, MF branches"

mf_branches_all_genes_by_genes_fit = lm(mf_branches_all_genes_by_genes$con ~ mf_branches_all_genes_by_genes$div)

mf_branch_all_gene_p = ggplot(mf_branches_all_genes_by_genes, aes(x=div, y=con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  #geom_point(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  #geom_smooth(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), method="lm", size=2, linetype="dashed", color="#920000") +
  ggtitle("All genes, MF branches") +
  xlab("Divergent AA substitutions") +
  ylab("Convergent AA substitutions") +
  bartheme() +
  theme(axis.title=element_text(size=12),
        plot.title = element_text(hjust=0.5, size=14))

#print(mf_branch_all_gene_p)


gene_conv_table = rbind(gene_conv_table, data.frame("Data"=c("All genes, MF branches"), "Slope"=c(mf_branches_all_genes_by_genes_fit$coefficients[2]), "R-squared"=c(summary(mf_branches_all_genes_by_genes_fit)$adj.r.squared)))
mf_branches_mf_genes_by_genes = subset(mf_branches_all_genes_by_genes, pid %in% morpho_genes$pid)
mf_branches_mf_genes_by_genes$label = "MF genes, MF branches"

mf_branches_mf_genes_by_genes_fit = lm(mf_branches_mf_genes_by_genes$con ~ mf_branches_mf_genes_by_genes$div)

mf_branch_mf_gene_p = ggplot(mf_branches_mf_genes_by_genes, aes(x=div, y=con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  #geom_point(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  #geom_smooth(data=mf_branches_all_genes_by_genes, aes(x=div, y=con), method="lm", size=2, linetype="dashed", color="#920000") +
  ggtitle("MF genes, MF branches") +
  xlab("Divergent AA substitutions") +
  ylab("Convergent AA substitutions") +
  bartheme() +
  theme(axis.title=element_text(size=12),
        plot.title = element_text(hjust=0.5, size=14))

#print(mf_branch_mf_gene_p)

gene_conv_table = rbind(gene_conv_table, data.frame("Data"=c("MF genes, MF branches"), "Slope"=c(mf_branches_mf_genes_by_genes_fit$coefficients[2]), "R-squared"=c(summary(mf_branches_mf_genes_by_genes_fit)$adj.r.squared)))
gene_conv_plots = plot_grid(non_mf_branch_all_gene_p, mf_branch_all_gene_p, non_mf_branch_mf_gene_p, mf_branch_mf_gene_p, ncol=2)
print(gene_conv_plots)

gene_conv_table %>% kable(row.names=F) %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)
Data Slope R.squared
All genes, non-MF branches 0.0568405 0.8032369
MF genes, non-MF branches 0.0590619 0.8297499
All genes, MF branches 0.0192501 0.0093570
MF genes, MF branches 0.0272814 0.0109804

4.2 Convergent substitutions

conv_counts = rbind(non_mf_branches_all_genes_by_genes, non_mf_branches_mf_genes_by_genes, mf_branches_all_genes_by_genes, mf_branches_mf_genes_by_genes)
# Convert branch categories to long format

conv_counts$label = factor(conv_counts$label, levels=c("All genes, non-MF branches", "All genes, MF branches", "MF genes, non-MF branches", "MF genes, MF branches"))

x_comps = list(c("All genes, non-MF branches", "All genes, MF branches"), c("All genes, MF branches", "MF genes, MF branches"), c("MF genes, non-MF branches", "MF genes, MF branches"))

gene_conv_counts = ggplot(conv_counts, aes(x=label, y=con, group=label)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25) +
  geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  ylab("# of convergent substitutions\nper gene") +
  xlab("") +
  bartheme() +
  theme(axis.text.x = element_text(angle=15, hjust=1)) +
  guides(colour=guide_legend(override.aes=list(alpha=1)))
print(gene_conv_counts)

## Convergent substitutions

conv_counts = rbind(non_mf_branches_all_genes_by_genes, non_mf_branches_mf_genes_by_genes, mf_branches_all_genes_by_genes, mf_branches_mf_genes_by_genes)
# Convert branch categories to long format

conv_counts$label = factor(conv_counts$label, levels=c("All genes, non-MF branches", "All genes, MF branches", "MF genes, non-MF branches", "MF genes, MF branches"))

x_comps = list(c("All genes, non-MF branches", "All genes, MF branches"), c("All genes, MF branches", "MF genes, MF branches"), c("MF genes, non-MF branches", "MF genes, MF branches"))

gene_conv_counts = ggplot(conv_counts, aes(x=label, y=con/div, group=label)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25) +
  geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  ylab("Ratio of convergent to\ndivergent substitutions\nper gene") +
  xlab("") +
  bartheme() +
  theme(axis.text.x = element_text(angle=15, hjust=1)) +
  guides(colour=guide_legend(override.aes=list(alpha=1)))
print(gene_conv_counts)

5 Codon convergence

Excluding all descendant pair comparisons

5.1 Non-synonymous codon convergence

conv_branches_nodesc = conv_branches_nodesc %>% mutate(syn.con = rowSums(select(., starts_with("s.s.con")), na.rm=T))
#anc_info_w_root = anc_info_w_root %>% mutate(non.con = rowSums(select(., starts_with("n.n.con") | starts_with("s.n.con")), na.rm=T))
#anc_info_w_root = anc_info_w_root %>% mutate(non.div = rowSums(select(., starts_with("n.n.div") | starts_with("s.n.div")), na.rm=T))

conv_branches_nodesc = conv_branches_nodesc %>% mutate(non.con = rowSums(select(., starts_with("n.n.con")), na.rm=T))
conv_branches_nodesc = conv_branches_nodesc %>% mutate(non.div = rowSums(select(., starts_with("n.n.div")), na.rm=T))

ns_codon_p = ggplot(conv_branches_nodesc, aes(x=non.div, y=non.con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Divergent non-synonymous codon substitutions") +
  ylab("Convergent non-synonymous codon substitutions") +
  bartheme()

print(ns_codon_p)

5.2 Non-synonymous codon convergence / synonymous codon convergence

For each pair, counts where both convergent substitutions are non-synonymous vs. both are synonymous

cncs_set = subset(conv_branches_nodesc, syn.con > 4)

cncs_set$cncs = cncs_set$non.con / cncs_set$syn.con
#cncs_set$cncs[is.na(cncs_set$cncs)] = NA
#cncs_set$cncs[is.infinite(cncs_set$cncs)] = NA

cncs_p = ggplot(cncs_set, aes(x=cncs)) +
  geom_histogram(fill=corecol(numcol=1, offset=1), color="#ececec") +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  xlab("cN/cS") +
  ylab("Branch pairs") +
  bartheme()

print(cncs_p)

cat("Avg. cN/cS: ", mean(cncs_set$cncs, na.rm=T), "\n")
## Avg. cN/cS:  0.2916045
cat("Median cN/cS: ", median(cncs_set$cncs, na.rm=T), "\n")
## Median cN/cS:  0.2380952
cncs_subset = select(cncs_set, node1, node2, non.con, syn.con, cncs)
cncs_subset = cncs_subset[order(-cncs_subset$cncs), ]

head(cncs_subset, n=20)
cncs_95_perc = quantile(cncs_set$cncs, 0.95, na.rm=T)

cat("cN/cS 95th percentile: ", cncs_95_perc, "\n")
## cN/cS 95th percentile:  0.7777778

5.3 Synonymous vs. non-synonymous codon convergence

Y = above 95th percentile of cN/cS

N = below 95th percentile of cN/cS

cncs_subset$cncs.95 = ifelse(cncs_subset$cncs > cncs_95_perc, "Y", "N")

s_ns_codon_p = ggplot(cncs_subset, aes(x=syn.con, y=non.con, color=cncs.95)) +
  geom_point(size=2, alpha=0.3) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Convergent synonymous codon substitutions") +
  ylab("Convergent non-synonymous\ncodon substitutions") +
  bartheme()

print(s_ns_codon_p)

cs_95_perc = quantile(anc_info$cS, 0.95, na.rm=T)
cn_95_perc = quantile(anc_info$cN, 0.95, na.rm=T)

p = ggplot(anc_info, aes(x=cS, y=cN, color=node.type)) +
  geom_point(size=2, alpha=0.2) +
  geom_text_repel(aes(label=ifelse(cS>cs_95_perc | cN>cn_95_perc, as.character(node), '')), show_guide=F) +
  #(aes(label=ifelse(avg.dN>dn_95_perc, as.character(node), '')), show_guide=F) +
  geom_vline(xintercept=cs_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
  geom_hline(yintercept=cn_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
  #geom_smooth(method="lm", se=F, ) +
  xlab("cS per branch") + 
  ylab("cN per branch") + 
  bartheme() +
  theme(legend.position="bottom") +
  guides(colour = guide_legend(override.aes = list(alpha = 1)))
p = ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#666666")
print(p)

anc_info$cs.outlier = ifelse(anc_info$cS>cs_95_perc,anc_info$node,'')
anc_info$cn.outlier = ifelse(anc_info$cN>cn_95_perc,anc_info$node,'')
# Synonymous and non-synonymous convergent substitutions

#This results in the following distributions for number of convergent substitutions across branches (green lines indicating 95th percentile of each rate):
  
  
cs_95_perc = quantile(anc_info$cS, 0.95, na.rm=T)
cn_95_perc = quantile(anc_info$cN, 0.95, na.rm=T)

p = ggplot(anc_info, aes(x=cS, y=cN, color=node.type)) +
  geom_point(size=2, alpha=0.2) +
  geom_text_repel(aes(label=ifelse(cS>cs_95_perc | cN>cn_95_perc, as.character(node), '')), show_guide=F) +
  #(aes(label=ifelse(avg.dN>dn_95_perc, as.character(node), '')), show_guide=F) +
  geom_vline(xintercept=cs_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
  geom_hline(yintercept=cn_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
  #geom_smooth(method="lm", se=F, ) +
  xlab("cS per branch") + 
  ylab("cN per branch") + 
  bartheme() +
  theme(legend.position="bottom") +
  guides(colour = guide_legend(override.aes = list(alpha = 1)))
p = ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#666666")
print(p)

anc_info$cs.outlier = ifelse(anc_info$cS>cs_95_perc,anc_info$node,'')
anc_info$cn.outlier = ifelse(anc_info$cN>cn_95_perc,anc_info$node,'')
# Species tree with branches colored # of convergent substitutions to any other branch

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

c_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=anc_info_w_root$total.c)) +
  scale_color_continuous(name='Convergent subs', low=l, high=h) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9))
print(c_tree)
# Ratio of non-synyonymous to synonymous convergent substitutions (cN/cS)

cncs_p = ggplot(anc_info, aes(x=cn.cs)) +
  geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666") +
  scale_y_continuous(expand=c(0,0)) +
  xlab("cN/cS") +
  ylab("# of branches") +
  bartheme()
print(cncs_p)
# Distribution of dN/dS when using gene trees

cncs_95_perc = quantile(anc_info$cn.cs, 0.95, na.rm=T)
anc_info$cncs.outlier = ifelse(anc_info$cn.cs>cncs_95_perc,anc_info$node,'')
# Species tree with branches colored by cN/cS

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

cncs_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=anc_info_w_root$cn.cs)) +
  scale_color_continuous(name='cN/cS', low=l, high=h) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9))
print(cncs_tree)
# Pairwise convergent substitutions

# Convergent and divergent substitutions between every pair of branches in the tree


pairwise_data = read.csv("../../data/rates/")

pairwise_data$convergent = pairwise_data$con.syn + pairwise_data$con.one.each + pairwise_data$con.non.syn
pairwise_data$divergent = pairwise_data$div.syn + pairwise_data$div.one.each + pairwise_data$div.non.syn

pairwise_p = ggplot(pairwise_data, aes(x=divergent, y=convergent)) +
  geom_point(size=2, alpha=0.3, color="#920000") +
  geom_smooth(method="lm", se=F, size=2, linetype="dashed", color="#333333") +
  bartheme()
print(pairwise_p)

< Back to summary